Amiga Format CD 41

home *** CD-ROM | disk | FTP | other *** search

/ Amiga Format CD 41 / Amiga Format CD41 (1999-06)(Future Publishing)(GB)[!][issue 1999-07].iso / -seriously_amiga- / misc / wordwrap / wordwrap.c < prev next >

Wrap

C/C++ Source or Header | 1999-04-19 | 24KB | 695 lines

/*-----------------------------------------------------------------------*
 | File:    wordwrap.c                                                    |
 | Author:  Wilhelm Nöker <wnoeker@t-online.de>                           |
 *-----------------------------------------------------------------------*
 | Reformat a text on the input stream to a given maximum line length.   |
 | Lots of command line options, see help() for details.                 |
 *-----------------------------------------------------------------------*/

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>

char version[] = "$VER: wordwrap 2.3 (30.03.99)";

#define WORDLEN   200   /* max. size of an input word                     */
#define KEYLEN     40   /* max. size of keywords (for -e/-aA/-zZ)         */
#define NDELIM     20   /* max. number of delimiter words                 */
#define NKEEP     100   /* max. number of escaped paragraphs              */
#define MARKROOM    2   /* getword() may append ' ' and '\n' to a word    */

typedef unsigned char UBYTE;   /* will make the lookup tables work better */

/* lookup tables, indexed by character code 0..255 (see init_tabs()): */
int is_graph[ 256 ], is_alnum[ 256 ], is_lower[ 256 ], is_cons[ 256 ];

/* adjustable parameters: */
UBYTE solword[ NDELIM ][ KEYLEN ], eolword[ NDELIM ][ KEYLEN ];
int scmplen[ NDELIM ], ecmplen[ NDELIM ];   /* 0 = whole-word compare      */
int eolwords = 0, solwords = 0;
int keeppar[ NKEEP ];                       /* sorted, smallest entry last */
int keeppars = 0;
int lmax = 75, shortchars = 0, indent = 0, blank2i = 0, tabsize = 4;
unsigned int imargin = 0, omargin = 0;

/* switches: */
int blanks = 0, i2blank = 0, smart_b2i = 0;
int widespace = 0, unhyphen = 0, wrapnbsp = 0;

/* running state shared by wordwrap(), getword() and scrollmode(): */
int   parno;    /* paragraph number, based on blank lines       */
int   lline;    /* length of the current output line            */
UBYTE joinme;   /* the last character of a hyphenated word      */

void init_tabs( void );
void newline( void );
void wordwrap( void );
int  get_char( void );
int  getword( UBYTE s[], int lim );
void scrollmode( int imrg, int omrg );


/* Table lookup that is safe for every value get_char() can return.      */
/* EOF is negative and must never be used as an index into is_graph[].   */
static int graphic( int c )
{
    return c >= 0 && c < 256 && is_graph[ c ];
}

/* Print the left margin of <omargin> spaces. */
static void put_margin( void )
{
    unsigned int k;

    for( k = 0; k < omargin; k++ )
        putchar( ' ' );
}

/* Copy a -a/-A/-z/-Z keyword into a KEYLEN sized slot, always NUL       */
/* terminated.  Returns the length actually stored, so that compare      */
/* lengths stay consistent with the stored text if it was truncated.     */
static int store_keyword( UBYTE dst[], const char *src )
{
    size_t len = strlen( src );

    if( len > KEYLEN-1 )
    {
        fprintf( stderr, "warning: keyword '%s' truncated to %d chars\n",
                 src, KEYLEN-1 );
        len = KEYLEN-1;
    }
    memcpy( dst, src, len );
    dst[ len ] = '\0';
    return (int)len;
}


void help( UBYTE *s )
/* print a help text and nag about illegal parameter <s> (may be NULL) */
{
    static const char *const text[] = {
        "'wordwrap' command line parameters:\n",
        " -l<len> line length, defaults to 75\n",
        " -b / -bc[<indent>] protect blank lines / convert to indentation\n",
        " -bC[<indent>] like -bc, but convert single blank lines only\n",
        " -i / -i<indent> / -ic protect indentation / convert to blank lines\n",
        " -ia / -ia<indent> add blank lines before indentations\n",
        " -t<tabsize> how to expand tab indentations\n",
        " -m<width> / -M<width> add / strip left margin\n",
        " -s<len> protect lines shorter than <len>\n",
        " -e<parno> exempt a paragraph from reformatting\n",
        " -h / -H undo hyphenation (preserving hyphens)\n",
        " -w / -W wide spaces after '.!?' (and ':')\n",
        " -n make non-breakable space breakable\n",
        " -a<sstr> / -A<sword> substring / word always starting a new line\n",
        " -z<estr> / -Z<eword> substring / word always ending a line\n",
    };
    size_t k;

    if( s )
        fprintf( stderr, "illegal option '%s'\n", (const char *)s );
    for( k = 0; k < sizeof text / sizeof text[0]; k++ )
        fputs( text[ k ], stderr );
}

/* Complain about <arg> via help() and hand back main()'s failure code. */
static int bad_option( char *arg )
{
    help( (UBYTE *)arg );
    return 10;
}


int main( int argc, char* argv[] )
/* command line parsing; returns 0 on success, 10 on a usage error */
{
    char *s;
    int   n;

    init_tabs();

    while( --argc > 0 )     /* "> 0" also copes with argc == 0 */
    {
        s = *++argv;        /* from here on argv[0] is the current argument */
        if( *s++ != '-' )
            return bad_option( argv[0] );

        switch( *s++ )      /* <s> now points behind the option letter */
        {
            case 'l': lmax = atoi( s );       break;
            case 's': shortchars = atoi( s ); break;
            case 'n': wrapnbsp = 1;           break;
            case 'h': unhyphen = 1;           break;
            case 'H': unhyphen = 2;           break;
            case 'w': widespace = 1;          break;
            case 'W': widespace = 2;          break;

            case 'm':
            case 'M':
                /* A negative width would wrap around to a huge unsigned */
                /* margin and print spaces (almost) forever.             */
                n = atoi( s );
                if( n < 0 )
                    return bad_option( argv[0] );
                if( s[-1] == 'm' )
                    omargin = (unsigned int)n;
                else
                    imargin = (unsigned int)n;
                break;

            case 't':
                /* A tab must expand to at least one column: with zero   */
                /* columns a tab-only indentation yields an empty word,  */
                /* which wordwrap() would mistake for EOF.               */
                n = atoi( s );
                if( n < 1 )
                    return bad_option( argv[0] );
                tabsize = n;
                break;

            case 'b':
                blanks = 1;
                /* smart_b2i is only touched when the letter is not 'c' */
                if( *s == 'c' || (smart_b2i = (*s == 'C')) )
                {
                    blank2i = -1;   /* will be replaced by a value >0 later */
                    if( isdigit( (unsigned char)*++s ) )
                        blank2i = atoi( s );
                }
                break;

            case 'i':
                indent = -1;        /* indent by original width */
                if( *s == 'c' )
                {
                    indent = 0;     /* don't indent */
                    i2blank = 1;
                }
                if( *s == 'a' )
                {
                    i2blank = 1;
                    s++;
                }
                if( isdigit( (unsigned char)*s ) )
                    indent = atoi( s );     /* forced indentation width */
                break;

            case 'e':
                if( keeppars < NKEEP )
                {
                    int i, p;

                    p = atoi( s );
                    /* Sort paragraph numbers, by insertion (descending, */
                    /* so the next paragraph to escape is the last one). */
                    for( i = keeppars; i>0; i-- )
                    {
                        if( keeppar[ i-1 ] < p )
                            keeppar[ i ] = keeppar[ i-1 ];
                        else
                            break;
                    }
                    keeppar[ i ] = p;
                    keeppars++;
                }
                else
                {
                    fprintf( stderr, "too many -e's\n" );
                    return 10;
                }
                break;

            case 'a':
            case 'A':
                if( solwords<NDELIM )
                {
                    n = store_keyword( solword[ solwords ], s );
                    /* lowercase option: substring match over <n> chars */
                    scmplen[ solwords++ ] = ( s[-1] == 'a' ) ? n : 0;
                }
                else
                {
                    fprintf( stderr, "too many -a/-A's\n" );
                    return 10;
                }
                break;

            case 'z':
            case 'Z':
                if( eolwords<NDELIM )
                {
                    n = store_keyword( eolword[ eolwords ], s );
                    ecmplen[ eolwords++ ] = ( s[-1] == 'z' ) ? n : 0;
                }
                else
                {
                    fprintf( stderr, "too many -z/-Z's\n" );
                    return 10;
                }
                break;

            case '?':
                help( NULL );
                return 0;

            default:
                return bad_option( argv[0] );
        }
    }

    if( blank2i<0 )         /* -bc without an explicit width */
        blank2i = ( indent>0 ) ? indent : 4;

    wordwrap();
    return 0;
}


void newline( void )
/* little assistant to wordwrap(), not completely trivial */
{
    if( joinme )
    {
        putchar( '-' );     /* flush pending hyphen */
        joinme = 0;
    }
    putchar( '\n' );        /* perform line feed */
    lline = 0;
    put_margin();           /* print left margin */
}


void wordwrap( void )
/* central function, copies from <stdin> to <stdout> */
{
    int   i, lword, dented, breakme;
    UBYTE *s, c, word[ WORDLEN ];

    put_margin();           /* left margin for the first output line */

    /* let's go */
    parno = 0;
    lline = 0;
    dented = 0;
    joinme = 0;
    breakme = 0;

    while( 1 )
    {
        /* Before we read anything: Maybe we have to process this */
        /* paragraph in escape mode?                              */
        if( keeppars>0 && parno == keeppar[ keeppars-1 ] )
        {
            /* Remove this entry from our list.                         */
            /* We do this in a loop to defend against duplicate entries. */
            while( keeppars>0 && parno == keeppar[ keeppars-1 ] )
                keeppars--;

            if( lline )
            {
                /* Characters on the current output line mean that no   */
                /* -b option was supplied. But we want a blank line here. */
                if( !dented )
                    newline();
                newline();
                dented = 0;
            }

            scrollmode( (int)imargin, (int)omargin );

            /* One trailing blank line after the escaped paragraph has   */
            /* been processed by scrollmode() itself to create a blank   */
            /* line. If it should also create an indentation, we'll have */
            /* to take care of that now.                                 */
            if( blank2i )
            {
                if( !smart_b2i )
                    while( lline < blank2i )
                    {
                        putchar( ' ' );
                        lline++;
                    }
                dented = 1;
            }
            continue;
        }

        /* Now read a word. */
        lword = getword( word, WORDLEN );
        if( lword == 0 )
            break;                              /* EOF, quit */

        breakme = ( word[ lword-1 ] == '\n' );  /* "short line" break request? */
        if( breakme )
            word[ --lword ] = '\0';

        /* Three main cases: */

        /* 1) this was a blank line */
        if( lword == 0 )
        {
            if( blank2i )
            {
                if( lline )
                    newline();
                if( !(smart_b2i && dented) )
                    while( lline < blank2i )
                    {
                        putchar( ' ' );
                        lline++;
                    }
                dented = 1;
            }
            else if( blanks )
            {
                if( lline )
                    newline();
                newline();
            }
        }

        /* 2) this was the start of an indented line */
        else if( !is_graph[ word[ 0 ] ] )
        {
            if( (indent || i2blank) && (unsigned int)lword > imargin )
            {
                if( lline )
                    newline();
                dented = 1;
                if( i2blank )
                    newline();
                if( indent>0 )
                {
                    for( lline = 0; lline<indent; lline++ )
                        putchar( ' ' );
                }
                else if( indent )
                {
                    /* keep the original indentation, minus the stripped margin */
                    fputs( (const char *)&word[ imargin ], stdout );
                    lline = lword - (int)imargin;
                }
            }
        }

        /* 3) a regular word */
        else
        {
            /* however, it might still be a "delimiter" word: */

            /* ... one that always ends a line? */
            for( i=0; i<eolwords; i++ )
            {
                s = word;
                if( ecmplen[i] && lword > ecmplen[i] )
                    s += lword-ecmplen[i];      /* compare the tail only */
                if( strcmp( (const char *)s, (const char *)eolword[i] ) == 0 )
                    breakme = 1;                /* treat it like a "short line" */
            }

            /* ... or one that must be at the start of a line? */
            for( i=0; i<solwords && lline; i++ )
                if( (scmplen[i] && strncmp( (const char *)word,
                                            (const char *)solword[i],
                                            (size_t)scmplen[i] ) == 0 )
                    || strcmp( (const char *)word,
                               (const char *)solword[i] ) == 0 )
                    newline();

            /* and we still might have to process the "hyphen" marker: */
            if( (c = word[ lword-1 ]) == '\t' )
                word[ --lword ] = '\0';

            /* now, finally, print the word: */
            if( lline == 0 || dented )
            {
                /* at the start of the line */
                fputs( (const char *)word, stdout );
                lline += lword;
                dented = 0;
            }
            else
            {
                /* (attempt to) append to an existing line */
                if( unhyphen<2 && is_lower[ joinme ] && is_lower[ word[0] ]
                    && lline+lword <= lmax )
                    ;   /* this will join a previously hyphenated word */
                else if( joinme && is_alnum[ word[0] ] && lline+lword < lmax )
                {
                    /* join a word, preserving the hyphen */
                    putchar( '-' );
                    lline++;
                }
                else
                {
                    if( joinme )
                    {
                        putchar( '-' );
                        lline++;
                        joinme = 0;
                    }
                    if( lline+lword < lmax )
                    {
                        /* standard case: insert a space between words */
                        putchar( ' ' );
                        lline++;
                    }
                    else    /* or start a new line */
                        newline();
                }
                fputs( (const char *)word, stdout );
                lline += lword;
            }

            /* hyphen marker processing, part 2: */
            joinme = ( c == '\t' ) ? word[ lword-1 ] : 0;

            /* pending "short line" break? */
            if( breakme )
                newline();
        }
    }

    if( lline )
    {
        if( joinme )
            putchar( '-' );
        putchar( '\n' );
    }
}


/*-----------------------------------------------------------------------*
   A filtering frontend for getchar(), which silently drops some stuff
   that getword() does not want to see, and does some conversions.

   - ANSI escapes (stuff starting with ESC '[' or 0x9b, ending with a
     character >='@') are dropped
   - printing characters as well as ' ', '\t', '\n' go unharmed
   - non-breakable space (0xa0) may be converted to ' ' if requested
   - '\r' and '\v' are converted to ' '
   - '\f' is converted to '\n'
   - all other non-printing characters (including DEL) are dropped

   Returns the (possibly converted) character, or EOF.

   Note that getword() assumes that this is built on top of getchar()
   and nothing else, as it calls ungetc( c, stdin ) directly at some
   point.
 *-----------------------------------------------------------------------*/

int get_char( void )
{
    int c = getchar();

    for( ;; )
    {
        if( c == 0x7f )                     /* DEL: drop, read on */
        {
            c = getchar();
            continue;
        }
        if( c == 0xa0 && wrapnbsp )
            return ' ';                     /* make nbsp breakable */
        if( c == EOF || (c & 0x60) )        /* EOF, or not a C0/C1 control */
            return c;

        switch( c )
        {
            case '\t':
            case '\n':
                return c;
            case '\v':
            case '\r':
                return ' ';
            case '\f':
                return '\n';

            case '\033':                    /* ESC ('\e' is not standard C) */
                c = getchar();
                if( c != '[' )
                    continue;               /* re-examine this character */
                /* got ESC '[', drop through to 0x9b */
                /* fallthrough */
            case 0x9b:
                do {
                    c = getchar();
                } while( c != EOF && c < '@' );
                break;

            default:                        /* other control codes: drop */
                break;
        }
        c = getchar();
    }
}


/*-----------------------------------------------------------------------*
   Copies one word of the input stream to s[] (return value is
   strlen(s)), then skips all subsequent spaces, stopping at the next
   word or EOL.

   - will return "\n" for an empty line
   - will return a string of spaces for a line starting with blanks
   - will append " " to a word ending a sentence (if widespace != 0)
   - will replace a trailing hyphen by "\t", (if at EOL and unhyphen != 0)
   - will append "\n" to the last word on a "short" line
   - will return an empty string at EOF

   Note that the combinations " \n" and "\t\n" may very well occur,
   whereas combinations of " " and "\t" won't.

   Reading an empty line will also bump the paragraph counter <parno>,
   *if* it was the first blank line after a regular one.

   <lim> is the size of s[].  MARKROOM bytes of it are kept free for the
   " " and "\n" markers, so the word text itself is cut after
   lim-1-MARKROOM characters (the rest is returned by the next call).
   Assumes lim > MARKROOM+1; the only caller passes WORDLEN.
 *-----------------------------------------------------------------------*/

int getword( UBYTE s[], int lim )
{
    int c, j, i = 0;
    const int wordmax = lim - 1 - MARKROOM;
    static int nonblanks = 0;   /* non-blank characters on this input line */
    static int inbetween = 1;   /* between paragraphs? */

    c = get_char();
    if( c == EOF )
    {
        s[ 0 ] = '\0';
        return 0;
    }

    if( !graphic( c ) )     /* This can only be at the start of a line, */
    {                       /* -> indented (or even a blank) line.      */
        /* EOF is tested explicitly: it is not a valid table index and  */
        /* must end the loop, or we would spin here forever.            */
        while( c != EOF && c != '\n' && !graphic( c ) )
        {
            j = ( c == '\t' ) ? tabsize : 1;
            while( j-- > 0 )
                if( i<lim-1 )
                    s[ i++ ] = ' ';
            c = get_char();
        }
        if( c == '\n' || c == EOF )
        {
            i = 0;
            s[ i++ ] = '\n';    /* blank line */
        }
    }
    else
    {
        /* read a word of non-blanks */
        while( graphic( c ) )
        {
            if( i<wordmax )
                s[ i++ ] = (UBYTE)c;
            else
            {
                fprintf( stderr, "warning: long input word (> %d chars)\n", lim );
                break;
            }
            c = get_char();
        }
        nonblanks += i;

        /* skip blanks after the word (stop at EOL and at EOF) */
        while( c != EOF && c != '\n' && !graphic( c ) )
            c = get_char();

        if( widespace )     /* check for end of sentence */
            if( i>2 && is_alnum[ s[ i-3 ] ]
                && !(i == 3 && is_cons[ s[i-3] ] && is_cons[ s[i-2] ]) )
                /* (makes sure that words like "g.", "e.g." and "Dr." */
                /* cannot create extra wide space)                    */
                switch( s[ i-1 ] )
                {
                    case '.':
                    case '!':
                    case '?':
                        s[ i++ ] = ' ';
                        break;
                    case ':':
                        if( widespace>1 )
                            s[ i++ ] = ' ';
                        break;
                    default:
                        break;
                }

        if( c == '\n' )     /* Just read the last word on this line */
        {
            /* did we read the first half of a split word? */
            if( unhyphen && i>1 && s[ i-1 ] == '-' && is_alnum[ s[ i-2 ] ] )
                s[ i-1 ] = '\t';

            /* was this a "short line"? */
            if( nonblanks <= shortchars )
                s[ i++ ] = '\n';
            nonblanks = 0;
        }
    }

    if( graphic( c ) )      /* did we stop on a non-blank character? */
        ungetc( c, stdin );

    s[ i ] = '\0';

    if( *s == '\n' )        /* maybe bump the paragraph counter */
    {
        if( !inbetween )
            parno++;
        inbetween = 1;
    }
    else
        inbetween = 0;

    return i;
}


/*-----------------------------------------------------------------------*
   Copy a paragraph line by line, only adjusting the left margin.
   Stops after reading (and printing) a blank line, but not before
   processing at least one line containing printable text.

   <imrg> is the number of leading blank columns to strip from each
   input line.  <omrg> is not used here; the output margin is printed
   by newline() from the global <omargin>.

   To be consistent with what the wordwrap() body does, we will assume
   that the left margin has already been set for our first line, and we
   will set a left margin when we quit.

   Calling this function will always bump the paragraph counter.
 *-----------------------------------------------------------------------*/

void scrollmode( int imrg, int omrg )
{
    int c, i, nonblanks = 0, anything = 0;

    (void)omrg;

    fflush( stdout );
    fputs( "\033[2m", stderr );     /* highlight console output */
    fflush( stderr );

    while( (c = get_char()) != EOF )
    {
        /* Margin stripping. */
        i = imrg;
        while( i > 0 && c != EOF && c != '\n' && !is_graph[ c ] )
        {
            i -= ( c == '\t' ) ? tabsize : 1;
            c = get_char();
        }
        /* Usually we'll now have i=0, but for example stripping three  */
        /* spaces of one tab might leave us with i=-1. In that case the */
        /* following code would produce one space.                      */
        while( i++ < 0 )
            putchar( ' ' );

        /* Copy a line. */
        nonblanks = 0;
        while( c != '\n' && c != EOF )
        {
            putchar( c );
            if( is_graph[ c ] )
                nonblanks++;
            c = get_char();
        }
        newline();      /* this also creates a left margin */

        /* Was this a blank line, and should we quit? */
        if( nonblanks )
            anything = 1;
        else if( anything )
            break;
    }

    fflush( stdout );
    fputs( "\033[0m", stderr );     /* normalize console output */
    fflush( stderr );

    parno++;
}


/*-----------------------------------------------------------------------*
   Set up some tables to classify characters, assuming 8 bit ECMA /
   ISO Latin-1 charset.
 *-----------------------------------------------------------------------*/

void init_tabs( void )
{
    int c;

    /* Start by filling all tables with "FALSE". */
    for( c=0; c<256; c++ )
        is_graph[ c ] = is_alnum[ c ] = is_lower[ c ] = is_cons[ c ] = 0;

    /* Printable, non-blank characters: */
    for( c=33; c<127; c++ )     /* skip sp (32) and del (127) */
        is_graph[ c ] = 1;
    for( c=160; c<256; c++ )    /* we consider nbsp (160) non-blank! */
        is_graph[ c ] = 1;

    /* Alphanumeric characters: */
    for( c='a'; c<='z'; c++ )   /* standard ASCII */
        is_alnum[ c ] = 1;
    for( c='A'; c<='Z'; c++ )
        is_alnum[ c ] = 1;
    for( c='0'; c<='9'; c++ )
        is_alnum[ c ] = 1;
    for( c=192; c<256; c++ )    /* ECMA stuff */
        is_alnum[ c ] = 1;
    is_alnum[ 247 ] = 0;        /* "÷" */
    is_alnum[ 215 ] = 0;        /* "×" */

    /* Lowercase letters: */
    for( c='a'; c<='z'; c++ )
        is_lower[ c ] = 1;
    for( c=224; c<256; c++ )
        is_lower[ c ] = 1;
    is_lower[ 247 ] = 0;        /* "÷" */
    is_lower[ 223 ] = 1;        /* "ß" */

    /* Consonants: every ASCII lowercase letter that is not a vowel or 'y' */
    for( c='a'; c<='z'; c++ )
        if( strchr( "aeiouy", c ) == NULL )
            is_cons[ c ] = 1;

    /* ECMA consonants: "ç", "ð", "ñ", "þ" */
    is_cons[ 231 ] = 1;
    is_cons[ 240 ] = 1;
    is_cons[ 241 ] = 1;
    is_cons[ 254 ] = 1;

    for( c=32; c<256; c++ )     /* the corresponding uppercase letters */
        if( is_cons[ c ] )
            is_cons[ c-32 ] = 1;
    is_cons[ 223 ] = 1;         /* "ß" */
}